# coding:utf8
import re

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

from bs4 import BeautifulSoup

class HtmlParser(object):
    """Extract follow-up links and page data from a downloaded HTML page."""

    # Matches hrefs containing "/ios/<digits>/<digits>.html". Compiled once
    # at class level instead of on every call.
    _ARTICLE_HREF = re.compile(r'/ios/[0-9]+/[0-9]+\.html')

    def _get_new_urls(self, page_url, soup):
        """Collect the links on the page that match the article pattern.

        Args:
            page_url: URL the page was fetched from; used as the base for
                resolving relative hrefs.
            soup: Parsed document exposing ``find_all``.

        Returns:
            A set of URLs. Relative hrefs are resolved against ``page_url``;
            absolute hrefs are returned unchanged.
        """
        return {
            urljoin(page_url, anchor['href'])
            for anchor in soup.find_all('a', href=self._ARTICLE_HREF)
        }

    def _get_new_data(self, page_url, soup):
        """Build the data record for the page.

        Args:
            page_url: URL the page was fetched from.
            soup: Parsed document exposing ``find_all``.

        Returns:
            A dict with key ``'url'`` and, when the page contains at least
            one ``<h2>`` element, key ``'title'`` holding the text of the
            first one.
        """
        res_data = {'url': page_url}
        # The text of the first <h2> on the page is used as its title.
        headings = soup.find_all('h2')
        if headings:
            res_data['title'] = headings[0].get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse a downloaded page into new links and a data record.

        Args:
            page_url: URL the content was fetched from.
            html_cont: Page content, either raw bytes or decoded text.

        Returns:
            ``None`` when either argument is ``None``; otherwise a
            ``(new_urls, new_data)`` tuple as produced by
            ``_get_new_urls`` and ``_get_new_data``.
        """
        if page_url is None or html_cont is None:
            return None

        # from_encoding only applies to byte input; Beautiful Soup ignores it
        # (and emits a warning) when the markup is already decoded text.
        if isinstance(html_cont, bytes):
            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        else:
            soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data